Load the dataset we want.

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(janitor)
library(ggridges)
library(ggthemes)


library(stringr)
library(dplyr)
library(forcats)

#embedding plots in rmarkdown
knitr::opts_chunk$set(fig.width=12, fig.height=8, out.width = "80%")
theme_set(theme_bw())

First we import the data and clean it.

health = 
  readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'HEALTH') %>%
  clean_names() %>%
  select(1:7)

socioeconomic = 
  readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'SOCIOECONOMIC') %>%
  clean_names() %>%
  select(1:3, 10:18)

assistance = 
  readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'ASSISTANCE') %>%
  clean_names() %>%
  select(1:3, 23:29)

restaurant = 
  readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'RESTAURANTS') %>%
  clean_names() %>%
  select(1:9, 16:17)

county =
  readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'Supplemental Data - County') %>%
  clean_names()

state =
  readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'Supplemental Data - State') %>%
  clean_names() %>%
  select(1:2, 9:20, 27:40)

store = 
  readxl::read_xls('../food_enviroment_atlas.xls', sheet = 'STORES') %>%
  clean_names() %>%
  select(1:27)

Socioeconomic VS diabetes and obesity

health_state = health %>%
  group_by(state) %>%
  summarise(pct_diabetes_adults08 = mean(pct_diabetes_adults08),
            pct_diabetes_adults13 = mean(pct_diabetes_adults13),
            pct_obese_adults08 = mean(pct_obese_adults08),
            pct_obese_adults13 = mean(pct_obese_adults13))
  
socioeconomic_state = socioeconomic %>% 
  group_by(state) %>%
  summarise(pct_65older10 = mean(pct_65older10),
            pct_18younger10 = mean(pct_18younger10),
            medhhinc15 = mean(medhhinc15),
            povrate15 = mean(povrate15),
            childpovrate15 = mean(childpovrate15),
            perpov10 = mean(perpov10)/n(),
            perchldpov10 = mean(perchldpov10)/n())

social_health_whole = merge(socioeconomic, health,by=c("fips", "state", "county"))
social_health = merge(socioeconomic_state, health_state,by=c("state"))

#normally distributed 
hist(health$pct_obese_adults13)

hist(health$pct_diabetes_adults13)

# median income VS obesity
social_health_whole %>%
  group_by(state) %>%
  ggplot(aes(x = medhhinc15, y = pct_obese_adults13)) +
  geom_point(aes(color = state, size = 1), alpha = .6) +
  geom_smooth() +
  labs(
    x = "Median household income, 2015",
    y = "Percentage of adult obesity, 2013 ")  +  
  theme(text = element_text(size = 14), 
        axis.text.x = element_text(size = 10), 
        axis.text.y = element_text(size = 10))
## `geom_smooth()` using method = 'gam'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).

# median income VS diabetes
social_health_whole %>%
  group_by(state) %>%
  ggplot(aes(x = medhhinc15, y = pct_diabetes_adults13)) +
  geom_point(aes(color = state, size = 1), alpha = .6) +
  geom_smooth() +
  labs(
    x = "Median household income, 2015",
    y = "Percentage of adult diabetes, 2013 ")  +  
  theme(text = element_text(size = 14), 
        axis.text.x = element_text(size = 10), 
        axis.text.y = element_text(size = 10))
## `geom_smooth()` using method = 'gam'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).

## Warning: Removed 4 rows containing missing values (geom_point).

# Diabetes VS obesity
social_health_whole %>%
  group_by(state) %>%
  ggplot(aes(x = pct_obese_adults13, y = pct_diabetes_adults13)) +
  geom_point(aes(color = state, size = 1), alpha = .6) +
  geom_smooth() +
  labs(
    x = "Percentage of adult obesity, 2013",
    y = "Percentage of adult diabetes, 2013 ")  +  
  theme(text = element_text(size = 14), 
        axis.text.x = element_text(size = 10), 
        axis.text.y = element_text(size = 10))
## `geom_smooth()` using method = 'gam'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

Lunch Program

#prepare dataset for lunch program
lunch = state[-52,] %>% 
  gather(key = "year", value = "lunch_participants", national_school_lunch_program_participants_fy_2009:national_school_lunch_program_participants_fy_2015) %>% 
  mutate(year = str_replace(year, "national_school_lunch_program_participants_fy_", "")) %>% 
  select(statefips, state, year, lunch_participants)

lunch_mean = lunch %>% 
  group_by(state) %>% 
  mutate(mean_participants = mean(lunch_participants)) %>% 
  ungroup(state) %>% 
  mutate(state = fct_reorder(state, mean_participants))

#Barplot of Average Number of Participants in School Lunch Program (2011-2015)
ggplot(lunch_mean, aes(x = state, y = mean_participants)) +
  geom_bar(stat = "identity", fill = "blue", alpha = .6) +
  labs(title = "Average Number of Participants in School Lunch Program (2011-2015)",
    x = "state",
    y = "Average number of participants") +
  theme(legend.position = "bottom",
        text = element_text(size = 12),
        axis.text.x = element_text(angle = 90, hjust = 1))

The relationship between lunch program and diabetes

health_13 = health %>% 
  group_by(state) %>% 
  mutate(diabetes_13 = mean(pct_diabetes_adults13, na.rm = T),
         obesites_13 = mean(pct_obese_adults13, na.rm = T)) %>% 
  mutate(statefips = str_sub(fips, 1, 2))%>% 
  filter(!duplicated(state)) %>% 
  ungroup(state) %>% 
  select(statefips, diabetes_13, obesites_13) %>% 
  left_join(state, by = "statefips") %>% 
  rename(lunch_2009 = "national_school_lunch_program_participants_fy_2009",
         lunch_2011 = "national_school_lunch_program_participants_fy_2011",
         lunch_2012 = "national_school_lunch_program_participants_fy_2012",
         lunch_2013 = "national_school_lunch_program_participants_fy_2013") %>% 
  select(statefips, state, diabetes_13, obesites_13, lunch_2009, lunch_2011, lunch_2012, lunch_2013)

#linear regression: diabetes_13 v.s. lunch participants (2013)
summary(lm(diabetes_13 ~ log(lunch_2013), data  = health_13))
## 
## Call:
## lm(formula = diabetes_13 ~ log(lunch_2013), data = health_13)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2232 -1.2219 -0.0489  0.9121  4.5302 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)       1.7692     3.2166   0.550  0.58479   
## log(lunch_2013)   0.6963     0.2507   2.777  0.00775 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.885 on 49 degrees of freedom
## Multiple R-squared:  0.136,  Adjusted R-squared:  0.1184 
## F-statistic: 7.712 on 1 and 49 DF,  p-value: 0.00775